Making date.factor variable :

data.ww2$date.factor <- 0

i=0

for ( i in 1:10586){
  if (data.ww2[i,2] <= 1949 ) {
    data.ww2[i,8] <- '1940s'}
  
  else if (data.ww2[i,2] <= 1959){
     data.ww2[i,8] <- '1950s'
  }
   else if (data.ww2[i,2] <= 1969){
     data.ww2[i,8] <- '1960s'
   }
   else if (data.ww2[i,2] <= 1979){
     data.ww2[i,8] <- '1970s'
   }
   else if (data.ww2[i,2] <= 1989){
     data.ww2[i,8] <- '1980s'
   }
   else if (data.ww2[i,2] <= 1999){
     data.ww2[i,8] <- '1990s'
   }
   else if (data.ww2[i,2] <= 2009){
     data.ww2[i,8] <- '2000s'
   }
   else{ data.ww2[i,8] <- '2010s'
  }
}



data.ww2$date<-as.factor(data.ww2$date)

data.ww2$date.factor<-as.factor(data.ww2$date.factor)

str(data.ww2)
## 'data.frame':    10586 obs. of  8 variables:
##  $ key                   : chr  "Afghanistan - 1940" "Albania - 1940" "Algeria - 1940" "Angola - 1940" ...
##  $ date                  : Factor w/ 79 levels "1940","1941",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ country               : Factor w/ 193 levels "Afghanistan",..: 1 2 3 5 7 9 10 13 14 17 ...
##  $ continent             : Factor w/ 5 levels "Africa","Americas",..: 3 4 1 1 2 5 4 3 3 4 ...
##  $ gdp.number            : int  976 1820 3500 2330 7290 10100 7530 5570 1180 7090 ...
##  $ life.expectancy.number: num  31.4 42.2 37.1 33 58.4 66.5 57.8 34.4 33.5 56 ...
##  $ population.number     : int  7030000 1120000 7800000 4090000 14200000 7050000 6700000 105000 37400000 8340000 ...
##  $ date.factor           : Factor w/ 8 levels "1940s","1950s",..: 1 1 1 1 1 1 1 1 1 1 ...

Seeing plots by ggpairs:

ggpairs(data.ww2[,c(5:6,8)])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Seeing plots by plotly:

#plot_ly(data.ww2, x= ~log(gdp.number) ,y=~life.expectancy.number , z=~date, type= 'scatter3d' , marker= list(size=1.5))

1. Method: Modeling without continent information:

(1-1) model with linear:

data.ww2.lm = lm(life.expectancy.number ~ log10(gdp.number) + date, data = data.ww2 )
data.ww2.lm.df = augment(data.ww2.lm)

colnames(data.ww2.lm.df) = c("life.expectancy.number" , "gdp.number" ,  "date"  ,   ".fitted"  , ".se.fit" ,  ".resid" , ".hat" ,   ".sigma" ,".cooksd"  ,   ".std.resid"  )

ggplot(data.ww2.lm.df, aes(x = gdp.number, y = .resid)) + 
  geom_smooth(se=F) +geom_point(alpha=0.1) +
  ggtitle('Residual Plot for  Linear Model ') +
  labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data.ww2.lm.df, aes(x = .fitted, y = abs(.resid))) + 
  geom_smooth(se=F) +geom_point(alpha=0.1)+
  ggtitle('Residual & Fitted Plot for Linear Model ') +
  labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

-not good residual plot

Make predictions:

ww2.lm.grid <-expand.grid(gdp.number = seq(5.5, 11.6, 0.1), date = seq(1940,2018,1))

ww2.lm.grid$date<-as.factor(ww2.lm.grid$date)


ww2.lm.grid.predict = predict(data.ww2.lm, newdata = ww2.lm.grid)
ww2.lm.grid.predict = data.frame(ww2.lm.grid, life.expectancy = as.vector(ww2.lm.grid.predict))


#ggplot(ww2.lm.grid.predict, aes(x = gdp.number, y = life.expectancy)) +  geom_line() +facet_wrap(~date)



ggplot(ww2.lm.grid.predict, aes(x = gdp.number, y = life.expectancy, group = date, color = date)) + geom_line() +
  ggtitle('Predicted Life Expectancy per GDP after WW2' ) + 
labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model with seperate dates')

-looks different in every date / date has effect!

(1-2) model with liner : using date.factors

data.ww2.lm.f = lm(life.expectancy.number ~ log10(gdp.number) + date.factor, data = data.ww2)
data.ww2.lm.f.df = augment(data.ww2.lm.f)

colnames(data.ww2.lm.f.df) = c("life.expectancy.number" , "gdp.number" ,  "date"  ,   ".fitted"  , ".se.fit" ,  ".resid" , ".hat" ,   ".sigma" ,".cooksd"  ,   ".std.resid"  )

ggplot(data.ww2.lm.f.df, aes(x = gdp.number, y = .resid)) + 
  geom_smooth(se=F) +geom_point(alpha=0.1) +
  ggtitle('Residual Plot for  Linear Model ') +
  labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data.ww2.lm.df, aes(x = .fitted, y = abs(.resid))) + 
  geom_smooth(se=F) +geom_point(alpha=0.1)+
  ggtitle('Residual & Fitted Plot for Linear Model ') +
  labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Make predictions:

ww2.lm.f.grid <-expand.grid(gdp.number = seq(5.5, 11.6, 0.1), date.factor = c('1940s','1950s','1960s','1970s','1980s','1990s','2010s'))

ww2.lm.f.grid$date.factor<-as.factor(ww2.lm.f.grid$date.factor)


ww2.lm.f.grid.predict = predict(data.ww2.lm.f, newdata = ww2.lm.f.grid)
ww2.lm.f.grid.predict = data.frame(ww2.lm.f.grid, life.expectancy = as.vector(ww2.lm.f.grid.predict))



#ggplot(ww2.lm.f.grid.predict, aes(x = gdp.number, y = life.expectancy)) +  geom_line() +facet_wrap(~date.factor)

ggplot(ww2.lm.f.grid.predict, aes(x = gdp.number, y = life.expectancy, group = date.factor, color = date.factor)) + geom_line()+
  ggtitle('Predicted Life Expectancy per GDP after WW2' ) + 
  labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model with dates together')

Check the R-squared value for each model:

summary(data.ww2.lm) #Adjusted R-squared:  0.7931 
## 
## Call:
## lm(formula = life.expectancy.number ~ log10(gdp.number) + date, 
##     data = data.ww2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.953  -3.498   0.250   4.098  19.428 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -13.37760    0.66294 -20.179  < 2e-16 ***
## log10(gdp.number)  16.35201    0.11934 137.016  < 2e-16 ***
## date1941           -0.26325    0.74836  -0.352  0.72501    
## date1942           -0.59908    0.74836  -0.801  0.42343    
## date1943           -0.75554    0.74836  -1.010  0.31271    
## date1944           -1.17385    0.74836  -1.569  0.11678    
## date1945            0.04849    0.74836   0.065  0.94834    
## date1946            2.83863    0.74836   3.793  0.00015 ***
## date1947            4.08149    0.74836   5.454 5.04e-08 ***
## date1948            5.28258    0.74837   7.059 1.79e-12 ***
## date1949            6.16455    0.74838   8.237  < 2e-16 ***
## date1950            7.24860    0.74839   9.686  < 2e-16 ***
## date1951            7.30784    0.74840   9.765  < 2e-16 ***
## date1952            7.85601    0.74841  10.497  < 2e-16 ***
## date1953            8.19800    0.74844  10.954  < 2e-16 ***
## date1954            8.59137    0.74846  11.479  < 2e-16 ***
## date1955            8.95452    0.74848  11.964  < 2e-16 ***
## date1956            9.24828    0.74851  12.356  < 2e-16 ***
## date1957            9.51004    0.74854  12.705  < 2e-16 ***
## date1958           10.01966    0.74856  13.385  < 2e-16 ***
## date1959           10.24416    0.74860  13.684  < 2e-16 ***
## date1960           10.50180    0.74865  14.028  < 2e-16 ***
## date1961           10.86701    0.74868  14.515  < 2e-16 ***
## date1962           11.14669    0.74873  14.887  < 2e-16 ***
## date1963           11.46295    0.74878  15.309  < 2e-16 ***
## date1964           11.70125    0.74885  15.626  < 2e-16 ***
## date1965           11.87362    0.74891  15.854  < 2e-16 ***
## date1966           12.11740    0.74897  16.179  < 2e-16 ***
## date1967           12.40403    0.74902  16.560  < 2e-16 ***
## date1968           12.53720    0.74909  16.737  < 2e-16 ***
## date1969           12.63498    0.74918  16.865  < 2e-16 ***
## date1970           12.94943    0.74927  17.283  < 2e-16 ***
## date1971           13.17221    0.74935  17.578  < 2e-16 ***
## date1972           13.16700    0.74942  17.570  < 2e-16 ***
## date1973           13.59164    0.74949  18.135  < 2e-16 ***
## date1974           13.68842    0.74960  18.261  < 2e-16 ***
## date1975           13.96253    0.74959  18.627  < 2e-16 ***
## date1976           13.91502    0.74967  18.561  < 2e-16 ***
## date1977           14.34394    0.74973  19.132  < 2e-16 ***
## date1978           14.51168    0.74979  19.354  < 2e-16 ***
## date1979           14.65310    0.74985  19.541  < 2e-16 ***
## date1980           15.16758    0.74988  20.227  < 2e-16 ***
## date1981           15.37067    0.74987  20.498  < 2e-16 ***
## date1982           15.64724    0.74984  20.867  < 2e-16 ***
## date1983           15.98470    0.74982  21.318  < 2e-16 ***
## date1984           16.19476    0.74985  21.597  < 2e-16 ***
## date1985           16.45600    0.74985  21.946  < 2e-16 ***
## date1986           16.79364    0.74986  22.396  < 2e-16 ***
## date1987           16.87506    0.74989  22.503  < 2e-16 ***
## date1988           17.06481    0.74991  22.756  < 2e-16 ***
## date1989           17.50013    0.74991  23.336  < 2e-16 ***
## date1990           17.64148    0.74990  23.525  < 2e-16 ***
## date1991           17.88262    0.74984  23.849  < 2e-16 ***
## date1992           18.07015    0.74983  24.099  < 2e-16 ***
## date1993           18.13016    0.74981  24.180  < 2e-16 ***
## date1994           17.91198    0.74983  23.888  < 2e-16 ***
## date1995           17.99906    0.74993  24.001  < 2e-16 ***
## date1996           17.83637    0.75006  23.780  < 2e-16 ***
## date1997           17.67691    0.75019  23.563  < 2e-16 ***
## date1998           17.62551    0.75025  23.493  < 2e-16 ***
## date1999           17.70647    0.75030  23.599  < 2e-16 ***
## date2000           17.85390    0.75038  23.793  < 2e-16 ***
## date2001           17.99869    0.75044  23.984  < 2e-16 ***
## date2002           18.12362    0.75049  24.149  < 2e-16 ***
## date2003           18.23038    0.75056  24.289  < 2e-16 ***
## date2004           18.20683    0.75073  24.252  < 2e-16 ***
## date2005           18.35342    0.75087  24.443  < 2e-16 ***
## date2006           18.43789    0.75104  24.550  < 2e-16 ***
## date2007           18.55279    0.75121  24.697  < 2e-16 ***
## date2008           18.75883    0.75131  24.968  < 2e-16 ***
## date2009           19.24440    0.75126  25.616  < 2e-16 ***
## date2010           19.24323    0.75139  25.610  < 2e-16 ***
## date2011           19.62227    0.75146  26.112  < 2e-16 ***
## date2012           19.92723    0.75157  26.514  < 2e-16 ***
## date2013           20.21041    0.75164  26.888  < 2e-16 ***
## date2014           20.36586    0.75171  27.093  < 2e-16 ***
## date2015           20.67405    0.75177  27.501  < 2e-16 ***
## date2016           20.88265    0.75183  27.776  < 2e-16 ***
## date2017           20.99681    0.75194  27.924  < 2e-16 ***
## date2018           21.10238    0.75204  28.060  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.126 on 10506 degrees of freedom
## Multiple R-squared:  0.7947, Adjusted R-squared:  0.7931 
## F-statistic: 514.7 on 79 and 10506 DF,  p-value: < 2.2e-16
summary(data.ww2.lm.f) #Adjusted R-squared:  0.7877 
## 
## Call:
## lm(formula = life.expectancy.number ~ log10(gdp.number) + date.factor, 
##     data = data.ww2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.817  -3.532   0.253   4.152  18.505 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -12.0418     0.4393  -27.41   <2e-16 ***
## log10(gdp.number)  16.4195     0.1207  136.02   <2e-16 ***
## date.factor1950s    7.1489     0.2400   29.79   <2e-16 ***
## date.factor1960s   10.1473     0.2412   42.07   <2e-16 ***
## date.factor1970s   12.2098     0.2433   50.18   <2e-16 ***
## date.factor1980s   14.7169     0.2442   60.26   <2e-16 ***
## date.factor1990s   16.2585     0.2446   66.47   <2e-16 ***
## date.factor2000s   16.7801     0.2471   67.90   <2e-16 ***
## date.factor2010s   18.7344     0.2561   73.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.205 on 10577 degrees of freedom
## Multiple R-squared:  0.7879, Adjusted R-squared:  0.7877 
## F-statistic:  4911 on 8 and 10577 DF,  p-value: < 2.2e-16

2. Method: Modeling with different continent information:

(2-1) model with linear:

Write a function to fit a LM for each group:

continent.lm = function(data){
  lm(life.expectancy.number ~ log10(gdp.number) + date, data = data)
}

Make a nested data frame:

continent.lm.m = data.ww2 %>%
  group_by(continent) %>%
  nest()


continent.lm.m.m = map(continent.lm.m$data, continent.lm)

continent.lm.m = mutate(continent.lm.m, model = continent.lm.m.m)
continent.lm.m = mutate(continent.lm.m, .fitted = map2(data, model, add_predictions))
continent.lm.m = mutate(continent.lm.m, .resid = map2(data, model, add_residuals))
continent.lm.m.fitted = unnest(continent.lm.m, .fitted,.resid)

continent.lm.m.fitted<-continent.lm.m.fitted[,-c(10:16)]

ploting the residuals :

ggplot(continent.lm.m.fitted , aes(x=(gdp.number),y= resid))+ 
  geom_point()+ geom_smooth(method='loess',se=F)+
  scale_x_log10() +
  ggtitle("Residual Plot for Gdp per Capita and Life Expectancy")+
  labs(x='log 10 of gdp per capita')

ggplot(continent.lm.m.fitted , aes(x=(gdp.number),y= abs(resid))) + 
  geom_point()+ geom_smooth(method='loess',se=F)+
  scale_x_log10() +
  ggtitle("Absolute Residual Plot for Gdp per Capita and Life Expectancy")+
  labs(x='log 10 of gdp per capita')

Make model visuable:

continent.lm.m.fitted.df<-as.data.frame(continent.lm.m.fitted)


ggplot(continent.lm.m.fitted.df, aes(x = gdp.number, y = pred, group=date , color= date)) +
  geom_line()+scale_x_log10() +
  facet_grid(~continent)+
  ggtitle('Predicted Life Expectancy per GDP after WW2' ) + 
  labs(x= 'Log 10 GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model faceting by continents')

(1-2) model with liner : using date.factors

Write a function to fit a LM for each group:

continent.lm.f = function(data){
  lm(life.expectancy.number ~ log10(gdp.number) + date.factor, data = data)
}

Make a nested data frame:

continent.lm.m.f = data.ww2 %>%
  group_by(continent) %>%
  nest()

continent.lm.m.f.m = map(continent.lm.m.f$data, continent.lm.f)

continent.lm.m.f = mutate(continent.lm.m.f, model = continent.lm.m.f.m)
continent.lm.m.f = mutate(continent.lm.m.f, .fitted = map2(data, model, add_predictions))
continent.lm.m.f = mutate(continent.lm.m.f, .resid = map2(data, model, add_residuals))
continent.lm.m.f.fitted = unnest(continent.lm.m.f, .fitted,.resid)

continent.lm.m.f.fitted<-continent.lm.m.f.fitted[,-c(10:16)]

plotting the residuals :

ggplot(continent.lm.m.f.fitted , aes(x=(gdp.number),y= resid))+ 
  geom_point()+ geom_smooth(method='loess',se=F)+
  scale_x_log10() +
  ggtitle("Residual Plot for Gdp per Capita and Life Expectancy")+
  labs(x='log 10 of gdp per capita')

ggplot(continent.lm.m.f.fitted , aes(x=(gdp.number),y= abs(resid))) + 
  geom_point()+ geom_smooth(method='loess',se=F)+
  scale_x_log10() +
  ggtitle("Absolute Residual Plot for Gdp per Capita and Life Expectancy")+
  labs(x='log 10 of gdp per capita')

Visual graph:

ggplot(continent.lm.m.f.fitted, aes(x = gdp.number, y = pred, group = date.factor, color = date.factor)) +
  geom_line()+scale_x_log10()+
  facet_grid(~continent)+
  ggtitle('Predicted Life Expectancy per GDP after WW2' ) + 
  labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model considering continents and seperate dates', caption = 'Faceted by Continent')

ggplot(continent.lm.m.f.fitted, aes(x = gdp.number, y = pred, group = continent, color = continent)) +
  geom_line()+scale_x_log10()+
  facet_grid(~date.factor)+
  ggtitle('Predicted Life Expectancy per GDP after WW2' ) + 
  labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model considering continents with dates together', caption = 'Faceted by Date')

-As the date increases the GDP as well increase! -The GDP differs among continents too -And the life expectancy increases as well as the gdp increases